*** This do file creates the instrument, edited from Alex He's code to run with Census files ****

set more off
global projectdir "~"
global datadir "$projectdir/data"
global intermed "$projectdir/data/intermediate_files"
global rawdata "$projectdir/data/raw_pulls/umetrics/FSRDC_2018"
global alex_transfer  "$projectdir/data/transfer"

****************************************
*******aI. RENAME VARIABLES TO BE SAME AS IN VDE************
****************************************
** Award
use "$rawdata/um_iris_uer_2018_rawd_rsch.dta", clear
rename institutionid  institution_id 
rename uniqueawardnumber unique_award_number
rename fedawdnumstd fed_awd_num_std
rename fundingsourcenameraw funding_source_name_raw
rename fundingsourcenameclean funding_source_name_clean
rename awardtitle  award_title
rename periodstartdate period_start_date
rename periodenddate period_end_date
rename recipientaccountnumber recipient_account_number
rename totaldirectexpenditures total_direct_expenditures
rename overheadcharged overhead_charged
rename campusid campus_id
rename suborgunit sub_org_unit

save "$intermed/core_award2019q1.dta", replace

** Employee
use "$rawdata/um_iris_uer_2018_remp_rsch.dta", clear
rename institutionid  institution_id 
rename empnumber emp_number
rename uniqueawardnumber unique_award_number
rename fedawdnumstd fed_awd_num_std
rename periodstartdate period_start_date
rename periodenddate period_end_date
rename recipientaccountnumber recipient_account_number
rename objectcode object_code
rename jobtitle job_title
rename occupationalclass occupational_class
rename umetricsoccupationalclass umetrics_occupational_class
rename soccode soc_code
rename ftestatus fte_status
rename proportionearningsallocated proportion_earnings_allocated

save "$intermed/core_employee2019q1.dta", replace

****************************************
*******I. CLEAN RAW DATA FILES************
****************************************

/* Clean awards file to get funding source */

use "$intermed/core_award2019q1.dta", clear
ren institution_id submit_university
drop if recipient==""&(unique_award==""|unique_award=="#N/A")
replace overhead=0 if overhead<0
replace total_direct=0 if total_direct<0
drop funding_source_name_c
gsort period_start period_end unique_award submit_university funding_source recipient -sub_org
collapse (sum) overhead total_direct (first) sub_org, by (period_start period_end unique_award submit_university funding_source recipient)
*if an award has multiple source in a period, keep the source with the highest amount
bysort submit_university unique_award recipient period_start period_end: gen count=_N
drop if count>1&funding_source==""
gsort submit_university unique_award recipient period_start period_end -total_direct
by submit_university unique_award recipient period_start period_end: gen order=_n 
keep if order==1
gen byte flag_multiple_source=(count>1)
drop order count

bysort submit_university unique_award recipient: gen count1=_N
bysort submit_university unique_award recipient funding_source: gen count2=_N
gen byte singlesource_recipient= (count1==count2)
bysort submit_university unique_award: gen count3=_N
bysort submit_university unique_award funding_source: gen count4=_N
gen byte singlesource_award= (count3==count4)
drop count*
save "$intermed/funding_source", replace

use "$intermed/funding_source", clear
keep if singlesource_recipient==1
keep submit_university unique_award recipient funding_source
duplicates drop
ren funding_source fundingsource2
save "$intermed/funding_source_recipient", replace

use "$intermed/funding_source", clear
keep if singlesource_award==1
keep submit_university unique_award funding_source
duplicates drop
ren funding_source fundingsource3
save "$intermed/funding_source_award", replace

use "$intermed/funding_source", clear
gsort submit_u unique -sub_org
collapse (first) sub_org , by (submit unique)
save "$intermed/award_field", replace

/* Clean employee file */

use "$intermed/core_employee2019q1.dta", clear
ren institution_id submit_university
ren emp_number new_emp
drop object_code proportion job_title soc_code fte occupation fed_awd umetrics_occ
duplicates drop

*get funding source information from awards data
merge m:1 submit_university unique_award recipient period_start period_end/*
*/ using "$intermed/funding_source", keep(1 3) nogen
merge m:1 submit_university unique_award recipient /*
*/ using "$intermed/funding_source_recipient", keep(1 3) nogen
merge m:1 submit_university unique_award /*
*/ using "$intermed/funding_source_award", keep(1 3) nogen
replace funding_source=fundingsource2 if funding_source==""
replace funding_source=fundingsource3 if funding_source==""
drop overhead-fundingsource3

merge m:1 cfda using "$alex_transfer/cfda- agency.dta",/*
*/ keep(1 3) keepus(agency agency_abb program_title) nogen

keep period* unique_award submit_u cfda funding_source agency* new_emp
duplicates drop
compress

gen year_start=year(period_start)
gen year_end=year(period_end)

unique submit unique_award
unique submit new_emp
unique submit new_emp year_end
unique submit new_emp unique_award period_start period_end

drop if funding_source==""&cfda==""&unique=="#N/A"
compress
*drop 961,683 observations that cannot be identified a funding source of CFDA

gen byte federal_clear=(cfda!=""&cfda!="00.000"&cfda!="99.999"&agency!=0&agency!=.&agency_a!="UNKNOWN")

gen funding_type="STATE" if substr(agency_ab,1,5)=="STATE"
replace funding_type="FEDERAL" if federal_clear==1
replace funding_type="FEDERAL" if agency_ab=="FEDERAL"
replace funding_type="PRIVATE-FORPROFIT" if agency_ab=="FORPROFIT"
replace funding_type="PRIVATE-NONPROFIT" if agency_ab=="NONPROFIT"
replace funding_type="FOREIGN" if agency_ab=="FOREIGN"
replace funding_type="STATE" if agency_ab=="LOCAL"
replace funding_type="UNIVERSITY" if agency_ab=="UNIVERSITY"
replace funding_type="PRIVATE" if agency_ab=="INVEST"|agency_ab=="HOSPITAL"

gen funding_type_source="CFDA" if funding_type!=""

gen temp_agency=substr(unique,1,2)
destring temp_agency, replace force
*count if funding_type==""&temp_agency!=.&substr(unique,3,1)=="."&substr(unique,1,6)!="00.000"
replace funding_type="FEDERAL" if funding_type==""&temp_agency!=.&substr(unique,3,1)=="."/*
*/&substr(unique,1,6)!="00.000"&temp_agency>9&temp_agency<99&temp_agency!=89
replace cfda=substr(unique,1,6) if cfda==""&funding_type=="FEDERAL"&substr(unique,7,1)==" "
replace cfda=substr(unique,1,5) if cfda==""&funding_type=="FEDERAL"&substr(unique,6,1)==" "
replace cfda=substr(unique,1,4) if cfda==""&funding_type=="FEDERAL"&substr(unique,5,1)==" "
replace funding_type="PRIVATE-NONPROFIT" if cfda==""&funding_type_source!="CFDA"&funding_type=="FEDERAL"&temp_agency==31
replace funding_type_source="CFDA" if funding_type!=""

keep if funding_source!=""|funding_type!=""

unique submit unique_award
unique submit new_emp
unique submit new_emp year_end
unique submit new_emp unique_award period_start period_end

replace funding_source=upper(funding_source)
do "$projectdir/code/UMETRICS/identify_funding_type_from_source_7_1.do" 

drop temp*
drop funding_source
bysort new_emp submit_u period_start period_end unique: gen count=_N
drop if count>1
drop count
compress
save "$intermed/employee_cleaned.dta", replace


****************************************
*******II. CONSTRUCT INSTRUMENT************
****************************************

*get funding amount by award by year*

use "$intermed/core_award2019q1.dta", clear
ren institution_id submit_university
drop if recipient==""&(unique_award==""|unique_award=="#N/A")
replace overhead=-overhead if overhead<0
replace total_direct=-total_direct if total_direct<0
collapse (sum) overhead total_direct, by (period_start period_end unique_award submit_u)
drop if unique==""
save "$intermed/temp_award_amount_period.dta", replace

*****************************************************
*construct employee level shock for multiple CFDA employees*
*****************************************************

use "$intermed/core_employee2019q1.dta", clear
ren emp_num new_emp
ren institution_id submit_university
merge m:1 cfda using "$alex_transfer/cfda_all", keep(3) nogen keepus(cfda)
egen cfda_num=group(cfda)
bysort new_emp submit_u: egen ncfda=nvals(cfda_num)
drop if proportion<=0&ncfda>1
cap drop ncfda
bysort new_emp submit_u: egen ncfda=nvals(cfda_num)
drop if proportion==.&ncfda>1
collapse (max) proportion (mean) cfda_num, by (new_emp submit_u unique period_start period_end cfda)
replace proportion=1 if proportion>1&proportion<.
collapse (sum) proportion (mean) cfda_num, by (new_emp submit_u cfda)
bysort new_emp submit_u: egen temp_total=sum(proportion)
replace proportion=proportion/temp_total
bysort new_emp submit_u: egen ncfda=nvals(cfda_num)
replace proportion=1 if ncfda==1
bysort new_emp submit_u: egen temp_total2=sum(proportion)
drop if proportion==.&ncfda>1
assert temp_total2>0.99999&temp_total2<=1
drop temp*

joinby cfda using "$alex_transfer/singleaudit_cfda"
foreach var in amount amount_rd amount_nonrd {
replace `var'=proportion*log(`var')
}
collapse (sum) amount* (mean) ncfda, by(new_emp submit_u year)

compress
egen emp_num=group(new_emp)
tsset emp_num year
gen d_amountrd=amount_rd-L.amount_rd
gen d_amountrd_lag1=L.amount_rd-L2.amount_rd
gen d_amountrd_lag2=L2.amount_rd-L3.amount_rd
gen d_amountrd_lag3=L3.amount_rd-L4.amount_rd
gen d_amountrd_lag4=L4.amount_rd-L5.amount_rd
drop emp_num
save "$intermed/shock_multicfda_employee", replace

*****************************************************
***Generate instruments for multi-CFDA employees****
*****************************************************
use "$intermed/core_employee2019q1.dta", clear
gen byte faculty=(umetrics_occ=="Faculty"|umetrics_occ=="faculty")
ren emp_num new_emp
ren institution_id submit_university
merge m:1 unique submit using "$intermed/award_field", keep(1 3) nogen

collapse (max) fte proportion faculty (first) umetrics_occ sub_org, by (new_emp submit_u unique period_start period_end cfda)
merge 1:1 new_emp submit_university unique period_start period_end using "$intermed/employee_cleaned"/*
*/, keep(1 3) keepus(funding_type) nogen

bysort period_start period_end unique submit_u: egen total_fte=sum(fte)

merge m:1 period_start period_end unique_award submit_university using "$intermed/temp_award_amount_period", nogen keep(1 3)

replace overhead=overhead/total_fte*fte
replace total_direct=total_direct/total_fte*fte
gen overhead_federal=overhead if funding_type=="FEDERAL"
gen total_direct_federal=total_direct if funding_type=="FEDERAL"
gen overhead_private=overhead_ch if substr(funding_type,1,3)=="PRI"
gen total_direct_private=total_direct_ex if substr(funding_type,1,3)=="PRI"
gen total_direct_forprofit=total_direct_ex if funding_type=="PRIVATE-FORPROFIT"
gen total_direct_nonprofit=total_direct_ex if funding_type=="PRIVATE-NONPROFIT"
gen year_end=year(period_end)

preserve
collapse (sum) total_direct_expenditures, by (new_emp submit_u sub_org)
drop if sub_org==""
gsort new_emp -total_direct_expenditures
collapse (first) sub_org submit_u, by(new_emp)
save "$intermed/employee_field", replace
restore

collapse (sum) overhead* total_direct* (max) faculty (first) umetrics_occ, by (new_emp submit_u year_end)

*create balanced panel
drop if year==1900
egen emp_num=group(new_emp)
tsset emp_num year
tsfill, full

preserve
gsort emp_num -new_emp
collapse (first) submit_u new_emp faculty umetrics_occ, by(emp_num)
save "$intermed/emp_num_crosswalk", replace
restore

drop submit_u new_emp faculty umetrics_occ
merge m:1 emp_num using "$intermed/emp_num_crosswalk", nogen

foreach var in overhead_c overhead_f overhead_p total_direct_e total_direct_fe total_direct_p total_direct_fo total_direct_n {
replace `var'=0 if `var'==.
}

ren year_end year
merge 1:1 submit_u new_emp year using "$intermed/shock_multicfda_employee", keep(1 3) nogen

tsset emp_num year
merge m:1 new_emp using "$intermed/employee_field", keep(1 3) nogen
ren submit_u institution_id
merge m:1 institution_id sub_org_unit using "$alex_transfer/sub_org_singlecfda_fields_classified_new"/*
*/, keep(1 3) keepus(field) nogen
replace field="other" if field==""
ren institution_id submit_university
ren new_emp iris_employee_number
save "$intermed/final_multicfda.dta", replace
//This is precicely same dataset as in VDE at this stage

egen fieldid=group(field)
egen uni_suborg=group(submit sub_org)
gen log_expenditure=log(total_direct_ex)
tsset emp_num year

ren total_direct_ex total_direct_expenditure

foreach soc in federal private forprofit {
gen share_`soc'=total_direct_`soc'/total_direct_expenditure
gen share_`soc'_3year=(total_direct_`soc'+L.total_direct_`soc'+L2.total_direct_`soc')/ /*
*/(total_direct_expenditure+L.total_direct_expenditure+L2.total_direct_expenditure)
gen share_`soc'_2year=(total_direct_`soc'+L.total_direct_`soc')/ /*
*/(total_direct_expenditure+L.total_direct_expenditure)
replace share_`soc'_2year=share_`soc' if share_`soc'_2year==.
replace share_`soc'_3year=share_`soc'_2year if share_`soc'_3year==.
}

*drop universities with no private funding
bysort submit year: egen max_shprivate=max(share_private)

gen byte private=(share_private>=0.5)
replace private=. if share_private==.
gen byte forprofit=(share_forprofit>=0.5)
replace forprofit=. if share_forprofit==.

global sortvar "L.share_private L.share_forprofit L.share_federal L.private L.forprofit"

*winsorize
foreach var in log_expenditure d_amountrd d_amountrd_lag1 d_amountrd_lag2 d_amountrd_lag3 {
sum `var',d
replace `var'=r(p99) if `var'>r(p99)&`var'<.
replace `var'=r(p1) if `var'<r(p1)
}
save "$intermed/final_multicfda.dta", replace

